cd /athena/angsd/scratch/naw4005
wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8367773/SRR8367773.fastq.gz
wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8367773/SRR8367773_1.fastq.gz
wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR836/003/SRR8367773/SRR8367773_2.fastq.gz
SRR8367773_1 and SRR8367773_2 represent paired end reads. SRR8367773
has both in a single file.
Download the hg38 reference genome and gtf file.
#! /bin/bash -l
#SBATCH --partition=angsd_class
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --job-name="download_gen"
#SBATCH --time=24:0:00 # HH/MM/SS
#SBATCH --mem=4G # memory requested, units available: K,M,G,T
#SBATCH --mail-user=naw4005@med.cornell.edu
#SBATCH --mail-type=ALL
#SBATCH --requeue
if [ ! -r /athena/angsd/scratch/naw4005/hg38.fa.gz ] && [ ! -r /athena/angsd/scratch/naw4005/hg38.fa ]; then
wget https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/latest/hg38.fa.gz -P /athena/angsd/scratch/naw4005/
fi
if [ ! -r /athena/angsd/scratch/naw4005/hg38.ensGene.gtf.gz ] && [ ! -r /athena/angsd/scratch/naw4005/hg38.ensGene.gtf ]; then
wget https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/genes/hg38.ensGene.gtf.gz -P /athena/angsd/scratch/naw4005/
fi
Unzip genome
#! /bin/bash -l
#SBATCH --partition=angsd_class
#SBATCH --nodes=1#SBATCH --ntasks=1
#SBATCH --job-name="unzip"
#SBATCH --time=24:0:00 # HH/MM/SS
#SBATCH --mem=4G # memory requested, units available: K,M,G,T
#SBATCH --mail-user=naw4005@med.cornell.edu
#SBATCH --mail-type=ALL
#SBATCH --requeue
if [ -r /athena/angsd/scratch/naw4005/hg38.fa.gz ] && [ ! -r /athena/angsd/scratch/naw4005/hg38.fa ]; then
gunzip /athena/angsd/scratch/naw4005/hg38.fa.gz
fi
if [ -r /athena/angsd/scratch/naw4005/hg38.ensGene.gtf.gz ] && [ ! -r /athena/angsd/scratch/naw4005/hg38.ensGene.gtf ]; then
gunzip /athena/angsd/scratch/naw4005/hg38.ensGene.gtf.gz
fi
Generate genome index
#! /bin/bash -l
#SBATCH --partition=angsd_class
#SBATCH --nodes=1#SBATCH --ntasks=1
#SBATCH --job-name="gen_ind"
#SBATCH --cpus-per-task=8
#SBATCH --time=24:0:00 # HH/MM/SS
#SBATCH --mem=88G # memory requested, units available: K,M,G,T
#SBATCH --mail-user=naw4005@med.cornell.edu
#SBATCH --mail-type=ALL
#SBATCH --requeue
mamba activate angsd
# read in comman line arguments
dir_name=$1
fasta_files=$2
GTF_file=$3
read_len=$4
arg_count=$#
# calculate overhang length
overhang=$(($read_len - 1))
# check we have the right number of command line arguments
if [ $arg_count -lt 4 ]; then
echo "Not enough command line arguments. Exiting ..."
echo "Usage: generate_ind.sh <STAR_alignment_dir> <genome_fasta_file> <GTF_file> <read_length>"
exit
fi
# make directory if it does not exist
if [ ! -d ${dir_name} ]; then
mkdir ${dir_name}
fi
# generate genome index
STAR --runMode genomeGenerate\
--runThreadN 8\
--genomeDir ${dir_name}\
--genomeFastaFiles ${fasta_files}\
--sjdbGTFfile ${GTF_file}\
--sjdbOverhang ${overhang}
Run script
sbatch generate_ind.sh /athena/angsd/scratch/naw4005/hg38_ind /athena/angsd/scratch/naw4005/hg38.fa /athena/angsd/scratch/naw4005/hg38.ensGene.gtf 50
Run fastQC
srun -n1 --pty --partition=angsd_class --mem=4G bash -i
mamba activate angsd
fastqc /athena/angsd/scratch/naw4005/SRR8367773_1.fastq.gz --extract
fastqc /athena/angsd/scratch/naw4005/SRR8367773_2.fastq.gz --extract
The results of fastQC show that the both fastq files have fairly good sequence quality.
The only issue we see is with GC content.
This implies there is some adapter contamination. Since STAR
automatically performs softclipping for adapter contamination, I felt
that the reads were good enough quality to perform alignment.
Align to reference genome
#! /bin/bash -l
#SBATCH --partition=angsd_class
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --job-name="align"
#SBATCH --time=24:0:00 # HH/MM/SS
#SBATCH --mem=40G # memory requested, units available: K,M,G,T
#SBATCH --mail-user=naw4005@med.cornell.edu
#SBATCH --mail-type=ALL
#SBATCH --requeue
mamba activate angsd
# read in comman line arguments
gen_dir=$1
fastq_files=$2
out_dir_prefix=$3
arg_count=$#
out_dir="$(dirname "${out_dir_prefix}")"
# check we have the right number of command line arguments
if [ $arg_count -lt 3 ]; then
echo "Not enough command line arguments. Exiting ..."
echo "Usage: align.sh <genome_index_directory> <fastq_files> <out_directory_prefix>"
exit
fi
# make directory if it does not exist
if [ ! -d ${out_dir} ]; then
mkdir ${out_dir}
fi
STAR --runMode alignReads \
--runThreadN 8 \
--genomeDir ${gen_dir} \
--readFilesIn ${fastq_files} \
--readFilesCommand zcat \
--outFileNamePrefix ${out_dir_prefix} \
--outSAMtype BAM SortedByCoordinate \
--alignIntronMax 2700000 \
--alignIntronMin 3 \
--soloStrand forward
Run alignment
sbatch align.sh "/athena/angsd/scratch/naw4005/hg38_ind" "/athena/angsd/scratch/naw4005/SRR8367773_1.fastq.gz /athena/angsd/scratch/naw4005/SRR8367773_2.fastq.gz" "/athena/angsd/scratch/naw4005/alignments/SRR8367773."
Parameters changed: I added both reads in separated by a space because these are paired end reads. I chose for outSAMtype to be a BAM file sorted by coordinate. I changed alignIntronMax parameter to be 2700000 because the longest intron in the human genome is 2.7 million bp long. I changed alignIntronMin parameters to be 3. The default is 20, but I wanted to include introns smaller than 10 bp. I changed soloStrand to be forward since the library prep is a stranded protocol where the strand that is read is only in the forward direction of the original mRNA strand. I chose the outSAMtype to be be a BAM file that is sorted by coordinate because BAM files are compressed and because the sorting makes it easier to work with after alignment.
Summary of outcome and basic QC
mamba activate angsd
samtools view -H /athena/angsd/scratch/naw4005/alignments/SRR8367773.Aligned.sortedByCoord.out.bam | less
samtools index /athena/angsd/scratch/naw4005/alignments/SRR8367773.Aligned.sortedByCoord.out.bam
# basic QC using samtools
samtools flagstat /athena/angsd/scratch/naw4005/alignments/SRR8367773.Aligned.sortedByCoord.out.bam > SRR8367773.flagstats
samtools stats /athena/angsd/scratch/naw4005/alignments/SRR8367773.Aligned.sortedByCoord.out.bam > SRR8367773.stats
The flagstats file shows that 65005846 reads passed QC and 0 reads failed QC. It also shows that 39421296 QC-passed reads were properly paired 0 QC-failed reads were properly paired. There were 0 singletons and 0 reads with mate mapped to a different chromosome. It seems that alignment passed the necessary QC checks.